import jieba
import xlwt
import os

# Load the stop-word list (one word per line). A set gives O(1) membership
# tests for the `word not in stopwords` filtering below, and `with` closes
# the file handle (the original `open()` was never closed).
with open('cn_stopwords.txt', encoding='utf-8') as _stop_file:
    stopwords = {line.rstrip() for line in _stop_file}


# Scan the .txt reports in a folder, count keywords, and save the result to Excel.
def matchKeyWords(ThePath, keyWords, aim_path):
    """Count keyword frequencies in every .txt report under ``ThePath``
    and write one row per file into an .xls workbook at ``aim_path``.

    Parameters
    ----------
    ThePath : str
        Directory containing report files named ``<code>_<year>_<name>.txt``.
        (TODO confirm this naming scheme against the actual files.)
    keyWords : list[str]
        Keywords to count; each gets its own column after the fixed columns.
    aim_path : str
        Destination path of the generated .xls file.
    """
    book = xlwt.Workbook(encoding='utf-8', style_compression=0)
    sheet = book.add_sheet('关键词词频统计', cell_overwrite_ok=True)
    # Header row: four fixed columns, then one column per keyword.
    sheet.write(0, 0, '代码')
    sheet.write(0, 1, '简称')
    sheet.write(0, 2, '年份')
    sheet.write(0, 3, '总词数')
    for i, c_word in enumerate(keyWords):
        sheet.write(0, i + 4, c_word)
    row = 1
    for file in os.listdir(ThePath):
        if os.path.splitext(file)[-1] != ".txt":
            continue
        # Strip the ".txt" extension BEFORE splitting so the last field
        # (the stock short name) does not carry the extension, and split
        # only once instead of three times.
        parts = os.path.splitext(file)[0].split("_")
        stock_code, year, stock_name = parts[0], parts[1], parts[2]
        sheet.write(row, 0, stock_code)
        sheet.write(row, 1, stock_name)
        sheet.write(row, 2, year)
        print(f'正在统计{file}')
        txt_path = os.path.join(ThePath, file)
        with open(txt_path, "r", encoding='utf-8', errors='ignore') as fp:
            text = fp.read()
        # Total word count: jieba tokens with stop words removed.
        # (No need to materialize jieba.cut with list() just to filter it.)
        words_list = [word for word in jieba.cut(text) if word not in stopwords]
        sheet.write(row, 3, str(len(words_list)))
        # Keyword frequency is a raw substring count over the full text,
        # not over the token list — this keeps multi-word phrases intact,
        # matching the original behavior.
        for ind, word in enumerate(keyWords):
            sheet.write(row, ind + 4, str(text.count(word)))
        row += 1
    book.save(aim_path)


if __name__ == '__main__':
    # Folder holding the annual-report .txt files.
    ThePath = r'G:\年报\年报TXT版'
    # Folder where the frequency workbook is written.
    aim_path = r'G:\年报\词频统计'
    # Keywords whose frequencies are recorded.
    keywords = ['营业收入', '估值', '资产', '股东', '智能数据分析', '智能机器人', '机器学习', '深度学习']
    # Ensure the output directory exists, and build the path with
    # os.path.join instead of a backslash inside an f-string (fragile:
    # a backslash before a non-escape char only works by accident).
    os.makedirs(aim_path, exist_ok=True)
    matchKeyWords(ThePath, keywords, os.path.join(aim_path, '词频统计.xls'))